Library Setup

Import Data

data <- read.csv('GSE37704.csv')

Exploring Raw Data - box plot

l.data <- log(data[,-1] + 1)
boxplot(l.data, col = c(rep(2,3),rep(3,3)), main="Boxplot of log of dataset")

U test, Wilcoxon signed test

lowCount <- NULL
constantRead <- NULL
result <- NULL
Sign <- NULL
MW_U <- NULL
for(i in 1:nrow(l.data)){
  ctrl <- as.numeric( l.data[i, 1:3] )
  exp <- as.numeric( l.data[i, 4:6] )
  if( var(ctrl) == 0 ||
      var(exp) == 0){
    #variance of both group are 0, cannot use T test 
    constantRead <- c(constantRead, i)
    result<-c(result, NA)
    MW_U[i] <- NA
    Sign[i] <- NA
    next
  }
  S <- wilcox.test(ctrl, exp)
  MW <- wilcox.test(ctrl, exp,correct = F)
  Sign[i] <-  S$p.value
  MW_U[i] <- MW$p.value
}

a <- which.min(Sign)
b <- which.min(MW_U)
l.data[a,]
##     control_1 control_2 control_3 Hoxa1KN_1 Hoxa1KN_2 Hoxa1KN_3
## 104  2.772589  2.944439  2.772589  1.098612  1.098612  1.386294
l.data[b,]
##     control_1 control_2 control_3 Hoxa1KN_1 Hoxa1KN_2 Hoxa1KN_3
## 104  2.772589  2.944439  2.772589  1.098612  1.098612  1.386294

Pre-process Raw Data

rowMean <- rowMeans(data[2:7])
summary(rowMean)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##      0.0      2.2    120.7    910.8    660.8 417990.0
plot_ly(x=rowMean, type='histogram')

By playing with this graph, we can see a lot ‘low count’ genes. They need to be filtered out.

df <- data[rowMean > 1200,]
plot_ly(x=rowMeans(df[,-1]), type='histogram')

PCA

PCA

rownames(df) <- df$ensgene
df <- df[,-1]


pcaResult <- princomp(df)
plot(pcaResult, type = 'l')

pcaHC <- hclust(dist(pcaResult$scores), method='ward.D2')
plot(pcaHC)

geneCluster <- cutree(pcaHC, k=3)

geneDf.Temp <- data.frame(pcaResult$scores, cluster = factor(geneCluster))
geneDf <- transform(geneDf.Temp, cluster_name = paste('Cluster', geneCluster) )
# 
# plot_ly(geneDf,
#         x = geneDf$Comp.1,
#         y = geneDf$Comp.2,
#         mode = "markers", 
#         text = rownames(df),
#         color = geneDf$cluster_name, marker = list(size = 2)) 
#  
# p <- layout(p, title = "PCA Result", 
#        xaxis = list(title = "PC 1"),
#        yaxis = list(title = "PC 2"))
# 
# p